import pandas as pd
import matplotlib.pyplot as plt
# Load the salary records; the CSV is looked up relative to the working directory.
data = pd.read_csv('salaries.csv')
# Alternative absolute path from the author's machine, kept disabled:
# data = pd.read_csv('C:/Users/Neeraja/Desktop/IUB-DS/SEMESTER 3/DATA VIS/Data_Science_Fields_Salary_Categorization.csv')
# Preview the first rows to confirm the columns parsed as expected.
data.head()
| | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | EN | FT | Data Analytics Engineer | 13000 | USD | 13000 | AR | 100 | AR | S |
| 1 | 2022 | SE | FT | Data Engineer | 100000 | USD | 100000 | US | 0 | US | M |
| 2 | 2022 | SE | FT | Data Engineer | 78000 | USD | 78000 | US | 0 | US | M |
| 3 | 2022 | SE | FT | Data Engineer | 120000 | USD | 120000 | US | 0 | US | M |
| 4 | 2022 | SE | FT | Data Engineer | 95000 | USD | 95000 | US | 0 | US | M |
# Dimensions of the frame as (rows, columns).
data.shape
(1637, 11)
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1637 entries, 0 to 1636 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 1637 non-null int64 1 experience_level 1637 non-null object 2 employment_type 1637 non-null object 3 job_title 1637 non-null object 4 salary 1637 non-null int64 5 salary_currency 1637 non-null object 6 salary_in_usd 1637 non-null int64 7 employee_residence 1637 non-null object 8 remote_ratio 1637 non-null int64 9 company_location 1637 non-null object 10 company_size 1637 non-null object dtypes: int64(4), object(7) memory usage: 140.8+ KB
# Summary statistics of the numeric columns, shown with one row per column.
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| work_year | 1637.0 | 2021.770922 | 0.518070 | 2020.0 | 2022.0 | 2022.0 | 2022.0 | 2022.0 |
| salary | 1637.0 | 223294.370800 | 985438.837723 | 5000.0 | 85000.0 | 130000.0 | 175100.0 | 30400000.0 |
| salary_in_usd | 1637.0 | 126509.493586 | 63103.689059 | 5000.0 | 80165.0 | 128000.0 | 168000.0 | 450000.0 |
| remote_ratio | 1637.0 | 58.827123 | 46.909032 | 0.0 | 0.0 | 100.0 | 100.0 | 100.0 |
# Inspect the dtype pandas inferred for each column.
data.dtypes
work_year int64 experience_level object employment_type object job_title object salary int64 salary_currency object salary_in_usd int64 employee_residence object remote_ratio int64 company_location object company_size object dtype: object
# Disabled string clean-up for salary_in_usd (strip the decimals and thousands
# separators, then cast to int). Not needed for this CSV: the column already
# loads as int64.
# data['salary_in_usd']=data['salary_in_usd'].apply(lambda x: int((x.split(".")[0]).replace(",","")))
# Re-check the dtypes after the (disabled) conversion step.
data.dtypes
work_year int64 experience_level object employment_type object job_title object salary int64 salary_currency object salary_in_usd int64 employee_residence object remote_ratio int64 company_location object company_size object dtype: object
# Disabled: would drop the first column in place.
# NOTE(review): presumably meant for an export that carries a leftover index
# column; this CSV has none — confirm before re-enabling.
# data.drop(columns=data.columns[0], axis=1, inplace=True)
# Preview the frame again.
data.head()
| | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | EN | FT | Data Analytics Engineer | 13000 | USD | 13000 | AR | 100 | AR | S |
| 1 | 2022 | SE | FT | Data Engineer | 100000 | USD | 100000 | US | 0 | US | M |
| 2 | 2022 | SE | FT | Data Engineer | 78000 | USD | 78000 | US | 0 | US | M |
| 3 | 2022 | SE | FT | Data Engineer | 120000 | USD | 120000 | US | 0 | US | M |
| 4 | 2022 | SE | FT | Data Engineer | 95000 | USD | 95000 | US | 0 | US | M |
# Every distinct job title present in the data.
data['job_title'].unique().tolist()
['Data Analytics Engineer', 'Data Engineer', 'Data Specialist', 'Data Analytics Consultant', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Machine Learning Software Engineer', '3D Computer Vision Researcher', 'ML Engineer', 'Machine Learning Researcher', 'Data Architect', 'Research Engineer', 'Machine Learning Research Engineer', 'Analytics Engineer', 'Data Analytics Manager', 'Data Science Consultant', 'Deep Learning Engineer', 'BI Data Analyst', 'Data Science Manager', 'Applied Scientist', 'Data Management Specialist', 'Research Scientist', 'Autonomous Vehicle Technician', 'Data Science Tech Lead', 'BI Analyst', 'Machine Learning Developer', 'Machine Learning Scientist', 'Data Scientist Lead', 'Data Manager', 'Cloud Data Engineer', 'Head of Data', 'Data Operations Analyst', 'Data Operations Engineer', 'Marketing Data Analyst', 'Data Science Lead', 'Power BI Developer', 'Product Data Scientist', 'Big Data Engineer', 'Principal Data Architect', 'Machine Learning Manager', 'Lead Data Scientist', 'Lead Machine Learning Engineer', 'NLP Engineer', 'ETL Developer', 'AI Scientist', 'Business Data Analyst', 'Applied Machine Learning Scientist', 'Data Engineering Manager', 'Director of Data Science', 'Financial Data Analyst', 'Computer Vision Software Engineer', 'Product Data Analyst', 'Machine Learning Infrastructure Engineer', 'Applied Data Scientist', 'Cloud Data Architect', 'Lead Data Engineer', 'Head of Machine Learning', 'Data Science Engineer', 'Head of Data Science', 'Computer Vision Engineer', 'Principal Data Analyst', 'Data Analytics Lead', 'Principal Data Scientist', 'Principal Data Engineer', 'Lead Data Analyst', 'Director of Data Engineering', 'Big Data Architect', 'Staff Data Scientist', 'Finance Data Analyst']
# Strip the spaces inside each job title so that every title becomes a single
# token, then join all tokens into one space-separated string for the cloud.
data_Jobtitles = " ".join(title.replace(" ", "") for title in data['job_title'])
from wordcloud import WordCloud
from PIL import Image
wordcloud = WordCloud(
    width=2000,
    height=1800,
    margin=0,
    colormap='Blues',
).generate(data_Jobtitles)
# Render the cloud as an image without axes or padding.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.margins(x=0, y=0)
plt.show()
# Histogram of each numeric column, 20 bins apiece, laid out on a 2x2 grid.
data.hist(bins=20, layout=(2, 2), figsize=(10, 10))
array([[<AxesSubplot:title={'center':'work_year'}>,
<AxesSubplot:title={'center':'salary'}>],
[<AxesSubplot:title={'center':'salary_in_usd'}>,
<AxesSubplot:title={'center':'remote_ratio'}>]], dtype=object)
data.isna().sum()  # number of missing values in each column
work_year 0 experience_level 0 employment_type 0 job_title 0 salary 0 salary_currency 0 salary_in_usd 0 employee_residence 0 remote_ratio 0 company_location 0 company_size 0 dtype: int64
Data Analysis
import seaborn as sns
# import altair as alt
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import plotly.express as px
# Count employees per country of residence and keep the ten most common.
residence = data['employee_residence'].value_counts()
top10_employee_location = residence.head(10)
# One bar per country, coloured by country, with the count printed on the bar.
fig = px.bar(
    x=top10_employee_location.index,
    y=top10_employee_location.values,
    color=top10_employee_location.index,
    text_auto=True,
    title='Top 10 Location of Employee',
)
fig.update_layout(
    font={"size": 17, "family": "Franklin Gothic"},
    xaxis_title="Location of Employee",
    yaxis_title="count",
)
fig.show()
# Restrict the rest of the analysis to employees residing in the United States.
# An exact comparison is used rather than str.contains("US"): contains() runs a
# regex substring search, so it would also keep any residence value that merely
# includes the letters "US".
data = data[data['employee_residence'] == "US"]
# Preview the filtered frame.
data.head()
| | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2022 | SE | FT | Data Engineer | 100000 | USD | 100000 | US | 0 | US | M |
| 2 | 2022 | SE | FT | Data Engineer | 78000 | USD | 78000 | US | 0 | US | M |
| 3 | 2022 | SE | FT | Data Engineer | 120000 | USD | 120000 | US | 0 | US | M |
| 4 | 2022 | SE | FT | Data Engineer | 95000 | USD | 95000 | US | 0 | US | M |
| 5 | 2022 | SE | FT | Data Specialist | 110000 | USD | 110000 | US | 0 | US | M |
# 2x3 grid of salary (USD) by category: strip plots on the top row and box
# plots of the same three groupings on the bottom row.
salary_grid = plt.figure(figsize=(15, 15))
sns.stripplot(data=data, x='employment_type', y='salary_in_usd',
              ax=salary_grid.add_subplot(2, 3, 1))
sns.stripplot(data=data, x='company_size', y='salary_in_usd',
              ax=salary_grid.add_subplot(2, 3, 2))
sns.stripplot(data=data, x='experience_level', y='salary_in_usd',
              ax=salary_grid.add_subplot(2, 3, 3))
sns.boxplot(data=data, x="employment_type", y="salary_in_usd",
            ax=salary_grid.add_subplot(2, 3, 4))
sns.boxplot(data=data, x="company_size", y="salary_in_usd",
            ax=salary_grid.add_subplot(2, 3, 5))
sns.boxplot(data=data, x="experience_level", y="salary_in_usd",
            ax=salary_grid.add_subplot(2, 3, 6))
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
# Swarm plots of salary (USD) by employment type, company size and experience
# level. Only the first row of the 2x3 grid is populated.
swarm_grid = plt.figure(figsize=(15, 10))
sns.swarmplot(data=data, x='employment_type', y='salary_in_usd',
              ax=swarm_grid.add_subplot(2, 3, 1))
sns.swarmplot(data=data, x='company_size', y='salary_in_usd',
              ax=swarm_grid.add_subplot(2, 3, 2))
sns.swarmplot(data=data, x='experience_level', y='salary_in_usd',
              ax=swarm_grid.add_subplot(2, 3, 3))
/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 64.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. /opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 71.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. /opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 12.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. /opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 76.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. /opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 36.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. /opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 6.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
# Violin plots of salary (USD) by employment type, company size and experience
# level. Only the first row of the 2x3 grid is populated.
violin_grid = plt.figure(figsize=(15, 10))
sns.violinplot(data=data, x='employment_type', y='salary_in_usd',
               ax=violin_grid.add_subplot(2, 3, 1))
sns.violinplot(data=data, x='company_size', y='salary_in_usd',
               ax=violin_grid.add_subplot(2, 3, 2))
sns.violinplot(data=data, x='experience_level', y='salary_in_usd',
               ax=violin_grid.add_subplot(2, 3, 3))
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
# Salary (USD) by employment type, split by company size: a swarm plot and a
# violin plot side by side in the first two cells of a 2x3 grid.
hue_grid = plt.figure(figsize=(25, 13))
sns.swarmplot(data=data, x='employment_type', y='salary_in_usd', hue="company_size",
              ax=hue_grid.add_subplot(2, 3, 1))
sns.violinplot(data=data, x='employment_type', y='salary_in_usd', hue="company_size",
               ax=hue_grid.add_subplot(2, 3, 2))
/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 39.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
<AxesSubplot:xlabel='employment_type', ylabel='salary_in_usd'>
# !pip install plotly
import plotly.express as px
#
def _salary_toggle_figure(frame, category):
    """Build a salary-by-category scatter with a chart-type dropdown.

    Args:
        frame: DataFrame holding ``category`` and ``salary_in_usd`` columns.
        category: Name of the categorical column used for the x axis and for
            colouring the points.

    Returns:
        A plotly ``go.Figure`` whose dropdown restyles the traces as a
        scatter, violin or box plot.
    """
    figure = go.Figure(
        px.scatter(
            frame,
            x=frame[category],
            y=frame["salary_in_usd"],
            color=frame[category],
        )
    )
    # (trace type, button label) pairs. Plotly trace type identifiers are
    # lowercase; the previous "Scatter" is not a valid identifier.
    chart_types = (
        ("scatter", "Scatter Plot"),
        ("violin", "Violin Plot"),
        ("box", "box Chart"),
    )
    buttons = [
        dict(args=["type", trace_type], label=label, method="restyle")
        for trace_type, label in chart_types
    ]
    figure.update_layout(updatemenus=[dict(buttons=buttons, direction="down")])
    return figure


# One interactive figure per categorical column, shown in the original order.
for category in ("employment_type", "company_size", "experience_level"):
    fig = _salary_toggle_figure(data, category)
    fig.show()
# Bar chart of how many rows fall into each remote-work category.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
# value_counts() returns counts ordered by frequency, so pairing its raw values
# with a fixed label list can attach a count to the wrong label. Translate each
# ratio to its label first, then reindex to the label order so every bar shows
# the count of its own category.
# NOTE(review): assumes remote_ratio only takes the values 0, 50 and 100; any
# other value stays unlabelled and is left out of the chart — confirm.
remote_labels = {100: 'Fully Remote', 50: 'Partially Remote', 0: 'No Remote Work'}
remote_counts = (
    data['remote_ratio']
    .map(remote_labels)
    .value_counts()
    .reindex(remote_type, fill_value=0)
)
fig = px.bar(
    x=remote_type,
    y=remote_counts.values,
    color=remote_type,
    text_auto=True,
    title='Remote Working Ratio Distribution',
)
fig.update_layout(
    xaxis_title="Remote Type",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Histogram of rows per work year, coloured by remote ratio.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
fig = px.histogram(
    x=data["work_year"],
    nbins=5,
    color=data["remote_ratio"],
    text_auto=True,
    title='Year Vs Remote Working ratio',
)
fig.update_layout(
    # The x axis carries work_year; it was previously mislabelled "Remote Type".
    xaxis_title="Work Year",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Entry-level / junior roles at large companies that are fully remote and pay
# at least the median salary (USD) of the current frame.
data_en_remote_pay_G_Median_L = data.loc[
    (data['experience_level'] == 'EN')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    ['job_title', 'company_location'],
]
data_en_remote_pay_G_Median_L
| | job_title | company_location |
|---|---|---|
| 939 | Data Analyst | US |
| 1007 | Machine Learning Developer | US |
| 1441 | Machine Learning Scientist | US |
# Salaries (USD) of the entry-level, fully remote, large-company rows that pay
# at least the median, in row order.
sal_list = data.loc[
    (data['experience_level'] == 'EN')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    'salary_in_usd',
].tolist()
sal_list
[150000, 180000, 225000]
# Interactive visualization: salary of each selected entry-level posting.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
entry_titles = data_en_remote_pay_G_Median_L['job_title']
fig = px.bar(
    x=entry_titles,
    y=sal_list,
    # Colour each bar by how often its title occurs in the selection. Mapping
    # the counts back onto the rows keeps one colour value per bar; the raw
    # value_counts() array has one entry per distinct title, which misaligns
    # (or fails on length) as soon as a title repeats.
    color=entry_titles.map(entry_titles.value_counts()).values,
    text_auto=True,
    title='Entery level/ Junior jobs that are remote and paying morethan median in Large Companies',
)
fig.update_layout(
    xaxis_title="Job Title",
    yaxis_title="Salary",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Senior-level / expert roles at large companies that are fully remote and pay
# at least the median salary (USD) of the current frame.
data_SE_remote_pay_G_Median_L = data.loc[
    (data['experience_level'] == 'SE')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    ['job_title', 'company_location'],
]
data_SE_remote_pay_G_Median_L.head()
| | job_title | company_location |
|---|---|---|
| 559 | Lead Data Scientist | US |
| 581 | Data Engineer | US |
| 602 | Data Scientist Lead | US |
| 605 | Data Scientist | US |
| 606 | Data Scientist | US |
# Interactive visualization: number of selected senior-level postings per title.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
# Take labels and heights from the same value_counts() result. unique() lists
# titles in order of appearance while value_counts() sorts by frequency, so
# combining the two attached counts to the wrong titles.
se_title_counts = data_SE_remote_pay_G_Median_L['job_title'].value_counts()
fig = px.bar(
    x=se_title_counts.index,
    y=se_title_counts.values,
    text_auto=True,
    title='Senior-level / Expert jobs that are remote and paying morethan median in Large Companies',
)
fig.update_layout(
    xaxis_title="Job Title",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Mid-level / intermediate roles at large companies that are fully remote and
# pay at least the median salary (USD) of the current frame.
data_MI_remote_pay_G_Median_L = data.loc[
    (data['experience_level'] == 'MI')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    ['job_title', 'company_location'],
]
data_MI_remote_pay_G_Median_L.head()
| | job_title | company_location |
|---|---|---|
| 516 | ML Engineer | US |
| 664 | Business Data Analyst | US |
| 1008 | Data Scientist | US |
| 1108 | Machine Learning Scientist | US |
| 1309 | Applied Data Scientist | US |
# Interactive visualization: number of selected mid-level postings per title.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
# Take labels and heights from the same value_counts() result. unique() lists
# titles in order of appearance while value_counts() sorts by frequency, so
# combining the two attached counts to the wrong titles.
mi_title_counts = data_MI_remote_pay_G_Median_L['job_title'].value_counts()
fig = px.bar(
    x=mi_title_counts.index,
    y=mi_title_counts.values,
    text_auto=True,
    title='Mid-level / Intermediate jobs that are remote and paying morethan median in Large Companies',
)
fig.update_layout(
    xaxis_title="Job Title",
    yaxis_title="count",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Executive-level / director roles at large companies that are fully remote and
# pay at least the median salary (USD) of the current frame.
data_EX_remote_pay_G_Median_L = data.loc[
    (data['experience_level'] == 'EX')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    ['job_title', 'company_location'],
]
data_EX_remote_pay_G_Median_L
# Salaries (USD) of the same rows, in row order.
sal_list1 = data.loc[
    (data['experience_level'] == 'EX')
    & (data['remote_ratio'] == 100)
    & (data['company_size'] == 'L')
    & (data['salary_in_usd'] >= np.median(data['salary_in_usd'])),
    'salary_in_usd',
].tolist()
sal_list1
[235000, 325000]
# Interactive visualization: salary of each selected executive-level posting.
remote_type = ['Fully Remote','Partially Remote','No Remote Work']
fig = px.bar(
    # Use the per-row titles rather than unique(): sal_list1 holds one salary
    # per row, so de-duplicating the titles breaks the pairing (and the length
    # check) whenever a title occurs more than once.
    x=data_EX_remote_pay_G_Median_L['job_title'],
    y=sal_list1,
    text_auto=True,
    title='Executive-level / Director jobs that are remote and paying morethan median in Large Companies',
)
fig.update_layout(
    xaxis_title="Job Title",
    yaxis_title="Salary",
    font=dict(size=17, family="Franklin Gothic"),
)
# showing the plot
fig.show()
# Fully remote roles paying at least the median salary (USD) of the current
# frame, selected separately for each experience level.
job_cols = ['job_title', 'company_location']
fully_remote = data['remote_ratio'] == 100
at_least_median = data['salary_in_usd'] >= np.median(data['salary_in_usd'])
# Entry level / junior
data_en_remote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EN') & fully_remote & at_least_median, job_cols
]
data_en_remote_pay_G_Median
# Senior level / expert
data_SE_remote_pay_G_Median = data.loc[
    (data['experience_level'] == 'SE') & fully_remote & at_least_median, job_cols
]
data_SE_remote_pay_G_Median.head()
# Mid level / intermediate
data_MI_remote_pay_G_Median = data.loc[
    (data['experience_level'] == 'MI') & fully_remote & at_least_median, job_cols
]
data_MI_remote_pay_G_Median.head()
# Executive level / director
data_EX_remote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EX') & fully_remote & at_least_median, job_cols
]
data_EX_remote_pay_G_Median.head()
| | job_title | company_location |
|---|---|---|
| 158 | Data Engineer | US |
| 159 | Data Engineer | US |
| 401 | Analytics Engineer | US |
| 402 | Analytics Engineer | US |
| 590 | Data Engineer | US |
# Partially remote roles (remote_ratio of 50) paying at least the median salary
# (USD) of the current frame, selected separately for each experience level.
job_cols = ['job_title', 'company_location']
partially_remote = data['remote_ratio'] == 50
at_least_median = data['salary_in_usd'] >= np.median(data['salary_in_usd'])
# Entry level / junior
data_en_ParRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EN') & partially_remote & at_least_median, job_cols
]
data_en_ParRemote_pay_G_Median
# Senior level / expert
data_SE_ParRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'SE') & partially_remote & at_least_median, job_cols
]
data_SE_ParRemote_pay_G_Median.head()
# Mid level / intermediate
data_MI_ParRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'MI') & partially_remote & at_least_median, job_cols
]
data_MI_ParRemote_pay_G_Median.head()
# Executive level / director
data_EX_ParRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EX') & partially_remote & at_least_median, job_cols
]
data_EX_ParRemote_pay_G_Median
| job_title | company_location |
|---|---|
# Non-remote roles (remote_ratio of 0) paying at least the median salary (USD)
# of the current frame, selected separately for each experience level.
job_cols = ['job_title', 'company_location']
not_remote = data['remote_ratio'] == 0
at_least_median = data['salary_in_usd'] >= np.median(data['salary_in_usd'])
# Entry level / junior
data_en_NotRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EN') & not_remote & at_least_median, job_cols
]
data_en_NotRemote_pay_G_Median
# Senior level / expert
data_SE_NotRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'SE') & not_remote & at_least_median, job_cols
]
data_SE_NotRemote_pay_G_Median.head()
# Mid level / intermediate
data_MI_NotRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'MI') & not_remote & at_least_median, job_cols
]
data_MI_NotRemote_pay_G_Median.head()
# Executive level / director
data_EX_NotRemote_pay_G_Median = data.loc[
    (data['experience_level'] == 'EX') & not_remote & at_least_median, job_cols
]
data_EX_NotRemote_pay_G_Median.head()
| | job_title | company_location |
|---|---|---|
| 325 | Head of Data | US |
| 326 | Head of Data | US |
| 706 | Data Science Manager | US |
| 707 | Data Science Manager | US |
| 947 | Data Science Manager | US |
# Number of postings per employment type, most common type first.
data_fulltimevsParttime = (
    data.groupby('employment_type')[['salary_in_usd']]
    .count()
    .rename(columns={'salary_in_usd': 'No_of_jobs'})
    .sort_values('No_of_jobs', ascending=False)
    .reset_index()
)
data_fulltimevsParttime.head()
| | employment_type | No_of_jobs |
|---|---|---|
| 0 | FT | 1167 |
| 1 | CT | 5 |
# Number of postings per job title, most common title first. No median filter
# is applied here; the rows are only counted.
data_JobsMedian = (
    data.groupby('job_title')[['salary']]
    .count()
    .rename(columns={'salary': 'No_of_positions'})
    .sort_values('No_of_positions', ascending=False)
)
data_JobsMedian.head()
| job_title | No_of_positions |
|---|---|
| Data Engineer | 333 |
| Data Scientist | 298 |
| Data Analyst | 198 |
| Machine Learning Engineer | 65 |
| Data Architect | 43 |
# Pairwise correlation of the numeric columns, rounded to two decimals.
# The numeric columns are selected explicitly: the frame also holds string
# columns, and DataFrame.corr() raises on those in pandas 2.x instead of
# silently skipping them.
correlation = data.select_dtypes(include='number').corr().round(2)
# Heatmap with the coefficient printed in every cell.
fig = px.imshow(correlation, text_auto=True)
fig.show()
# Count rows per job title, order by count (largest first) and keep the top ten.
data1 = (
    data.groupby(['job_title'])
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
data2 = data1.iloc[:10]
x = data2["job_title"]
y = data2["count"]
# One bar per title, coloured by title, with the count printed on the bar.
fig = px.bar(
    x=x,
    y=y,
    color=x,
    text_auto=True,
    title='Top 10 Job Titles',
)
fig.update_layout(
    font={"size": 17, "family": "Franklin Gothic"},
    xaxis_title="job_title",
    yaxis_title="Count",
)
# showing the plot
fig.show()
# Distinct (salary, job title) pairs; groupby sorts by salary, so the last 15
# rows are the highest-paying pairs.
data1 = data.groupby(['salary_in_usd', 'job_title']).size().reset_index()
data2 = data1.tail(15)
# Salary per job title, coloured by the salary value.
fig1 = px.bar(
    x=data2['job_title'],
    y=data2['salary_in_usd'],
    color=data2['salary_in_usd'],
    text_auto=True,
)
fig1.update_layout(
    font={"size": 17, "family": "Franklin Gothic"},
    xaxis_title="job_title",
    yaxis_title="Salary",
)
# showing the plot
fig1.show()